library(tidyverse)
## ── Attaching packages ────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
## 
##     plot
library(censusapi)
## 
## Attaching package: 'censusapi'
## The following object is masked from 'package:methods':
## 
##     getFunction
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(mapview)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lehdr)

options(
  tigris_class = "sf",
  tigris_use_cache = T 
)

Loading Blockgroups and Social Distancing Data

The code below loads in the Safegraph Social Distancing dataset.

scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F)

# Below uses tracts sent to us by San Jose
sj_tracts <- st_read("/users/ctenner/Pcloud Drive/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp") %>% 
  st_as_sf() %>% 
  st_transform(4269)
## Reading layer `CSJ_Census_Tracts' from data source `/Users/ctenner/pCloud Drive/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp' using driver `ESRI Shapefile'
## Simple feature collection with 219 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_citycouncil_disticts <- st_read("/Users/ctenner/Desktop/Classes/118/City Council Districts/CITY_COUNCIL_DISTRICTS.shp") %>% 
  st_as_sf() %>% 
  st_transform(4269)
## Reading layer `CITY_COUNCIL_DISTRICTS' from data source `/Users/ctenner/Desktop/Classes/118/City Council Districts/CITY_COUNCIL_DISTRICTS.shp' using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_blockgroups <- 
  scc_blockgroups %>% 
  st_centroid() %>% 
  st_join(sj_tracts, left = F) %>% 
  st_join(sj_citycouncil_disticts%>% dplyr::select(DISTRICTS)) %>% 
  mutate(
    DISTRICTS = DISTRICTS %>% factor(levels = c("1","2","3","4","5","6","7","8","9","10"))
  ) %>% 
  st_set_geometry(NULL) %>% 
  left_join(scc_blockgroups%>% dplyr::select(GEOID), by = "GEOID") %>% 
  st_as_sf() %>% 
  dplyr::select(GEOID, DISTRICTS)
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
# the spatial join leaves off two blockgroups which are touching district 9. The following code assigns those to district 9
sj_blockgroups$DISTRICTS[is.na(sj_blockgroups$DISTRICTS)] <- 9

mapview(sj_blockgroups, zcol = "DISTRICTS")
sj_socialdistancing <- readRDS("/users/ctenner/Pcloud Drive/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_socialdistancing.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date()) %>% 
  left_join(sj_blockgroups, by = c("origin_census_block_group"="GEOID")
  ) %>% 
  filter(!is.na(DISTRICTS)) %>% 
  dplyr::select(-geometry)

Median Household Income Analysis

My first analysis looks at the median household income of each block group in San Jose and how it correlates to the percent of residents who have been staying completely at home in the past three days.

Sys.setenv(CENSUS_KEY=" 6c05445ae84c23e1c62fd91756d0f56f51ae94ef")

# below code creates a "lookup" table that will be used later on
acs_vars <-
  listCensusMetadata(
    name = "2018/acs/acs5",
    type = "variables"
  )

# following code loads in census data and processes it to be more readable
sj_median_income_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "B19013_001E",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  rename(
    Median_Income = B19013_001E 
  ) %>% 
  filter(!is.na(Median_Income)) %>% 
  left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
  dplyr::select(-geometry) %>% # comment out this line if geometry is desired 
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  
  # this code joins our census data with the social distancing data, processed as shown below
  left_join(sj_socialdistancing %>%  
                          filter(date > max(date)-3) %>% 
                          group_by(origin_census_block_group) %>% 
                          summarize(
                                    completely_home_device_count = sum(completely_home_device_count),
                                    device_count = sum(device_count)) %>% 
                          mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
            by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(
    !is.na(device_count)
  ) 


# now we can create a scatter plot of median income v % completely at home
sj_median_income_by_block %>% 
  filter(Median_Income > 0) %>% 
  ggplot(aes(
  x = Median_Income,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Median Household Income",
    y = "% residents completely at home in past 3 days",
    title = "San Jose: Social Distancing and Median Household Income"
  )
## `geom_smooth()` using formula 'y ~ x'

Percent Below Federal Poverty Line Analysis

My second analysis looks at the percent of households below the federal poverty line in each block group and how this correlates with the percent of residents completly at home in the past three days.

sj_poverty_by_block <-
  # first we need to load census data. I'm loading in a table of poverty status by household type
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B17017)",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  
  # The following code shows my process of selecting the right variables from the census data
  gather(
    key = code,
    value = number,
    - blockgroup
  ) %>% 
  left_join(acs_vars %>% select_if(names(.) %in% c("name", "label")), 
            by = c("code" = "name")) %>% 

  # at this point I was able to identify the two variables I needed for analysis
  filter(
    code %in% c("B17017_001E", "B17017_002E")
  ) %>% 
  select(-c("label")) %>% 
  spread(code, number) %>% 
  rename(
    Households_Total = B17017_001E,
    Households_Under_Povertyline = B17017_002E
  ) %>% 
  
  # now we can add a column for % of households below poverty line and combine it with the sj_socialdistancing df
  mutate(
    Households_Total = as.numeric(Households_Total),
    Households_Under_Povertyline = as.numeric(Households_Under_Povertyline),
    `% households below poverty line` = Households_Under_Povertyline / Households_Total * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
    #sj_socialdistancing %>%  
                          #filter(date > max(date)-3) %>% 
                          #group_by(origin_census_block_group) %>% 
                          #summarize(
                                    #completely_home_device_count = sum(completely_home_device_count),
                                    #device_count = sum(device_count)) %>% 
                          #mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
            #by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(
    !is.na(device_count)  # this takes out block groups that aren't in SJ
  ) %>% 
  mutate(
    log_of_poverty = log(as.numeric(`% households below poverty line`))
  )
## Joining, by = "blockgroup"
# now we can look at the relationship between % completely at home and % under poverty line on a scatter plot
sj_poverty_by_block %>% ggplot(aes(
  x = `% households below poverty line`,
  y = `% Completely at Home`
)) + 
  geom_point() +
  labs(
    y = "% residents completely at home in past 3 days",
    x = log(as.numeric("% households below poverty line")),
    title = "San Jose: Social Distancing and Poverty Status"
  )
## Warning in list2(..., title = title, subtitle = subtitle, caption = caption, :
## NAs introduced by coercion

The results are heteroscedastic, so we take the log of the x-variable

sj_poverty_by_block %>% 
  filter(
    log_of_poverty > 0  # taking out blockgroups with no poverty 
  ) %>% 
  ggplot(aes(
  x = log_of_poverty,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    y = "Percent of devices completely at home in past 3 days",
    x = log(as.numeric("Percent of households below poverty line")),
    title = "San Jose: Social Distancing and Poverty Status"
  )
## Warning in list2(..., title = title, subtitle = subtitle, caption = caption, :
## NAs introduced by coercion
## `geom_smooth()` using formula 'y ~ x'

Percent Below 50%, 80% of AMI

Using area median income (AMI) is a more appropriate way to measure low-income status, as the cost of living is substantially higher than the federal average in San Jose. This analysis plots percent of devices entirely at home against percent of households with incomes 50% and 80% percent of Santa Clara County AMI.

sj_ami_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B19001)",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
  group_by(blockgroup) %>% 
  summarize(
    Total = B19001_001E,
    `Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
    #sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
    `Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
  ) %>% 
  mutate(
    `% under 75,000` = `Under 75,000` / Total * 100,
    `% under 100,000` = `Under 100,000` / Total * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
    #sj_socialdistancing %>%  
                          #filter(date > max(date)-3) %>% 
                          #group_by(origin_census_block_group) %>% 
                          #summarize(
                                    #completely_home_device_count = sum(completely_home_device_count),
                                    #device_count = sum(device_count)) %>% 
                          #mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
            #by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(!is.na(device_count))
## Joining, by = "blockgroup"
cor(sj_ami_by_block$`% under 75,000`, sj_ami_by_block$`% Completely at Home`)
## [1] -0.4182122
sj_ami_by_block %>% ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of households with incomes under $75,000 (50% AMI) annually",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Households Above and Below 50% AMI"
  )
## `geom_smooth()` using formula 'y ~ x'

sj_ami_by_block %>% ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of households with incomes under $100,000 (80% AMI) annually",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Households Above and Below 80% AMI"
  )
## `geom_smooth()` using formula 'y ~ x'

Using LODES Data

In this part of my analysis I load in a LODES dataset on SJ in order to compare percent of devices at home to a number of economic and employment factors. The

ca_rac <- 
  grab_lodes(
    state = "ca", 
    year = 2017, 
    lodes_type = "rac", 
    job_type = "JT01",
    segment = "S000", 
    state_part = "main",
    agg_geo = "bg"
  ) %>% 
  left_join(sj_blockgroups, by = c("h_bg" = "GEOID")) %>% 
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  mutate(
    percent_healthcare_workers = CNS18 / C000 * 100
  ) %>% 
  dplyr::select(h_bg, DISTRICTS, percent_healthcare_workers) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income), by = c("h_bg" = "blockgroup")) %>% 
  filter(
    !is.na(device_count)
  )
## Cached version of file found in /Users/ctenner/Desktop/Classes/118/Github/covid19/Cameron_Tenner/lodes_raw/ca_rac_S000_JT01_2017.csv.gz
## Reading now...
ca_rac %>% ggplot(aes(
  x = percent_healthcare_workers,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of workers in health care and social assistance",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Health Care Workers"
  )
## `geom_smooth()` using formula 'y ~ x'